# Kernel: xconv_blocksparse_32x32x16_fprop

[-
# Build-time switches for this kernel template. They are only declared here;
# whatever expands the template is expected to have set them beforehand
# (presumably via assembler defines -- confirm against the build scripts).
our ($type, $F16, $I16, $O16, $overlapK);

# Half-precision predicates for the F, I and O tensors: true when the global
# $type is 'h', otherwise whatever the per-tensor override flag holds.
sub F16 { return 1 if $type eq 'h'; return $F16; }
sub I16 { return 1 if $type eq 'h'; return $I16; }
sub O16 { return 1 if $type eq 'h'; return $O16; }

# Per-tensor element descriptors, fixed once when this block is evaluated:
#   dtype  - type suffix spliced into memory ops (e.g. LDG.E.CI.U16 / LDG.E.CI.32)
#   dshift - shift amount used when scaling an element index to a byte address
#   dsize  - element size in bytes
our ($dtypeF, $dshiftF, $dsizeF) = F16() ? ('U16', '1', '2') : ('32', '2', '4');
our ($dtypeI, $dshiftI, $dsizeI) = I16() ? ('U16', '1', '2') : ('32', '2', '4');
our ($dtypeO, $dshiftO, $dsizeO) = O16() ? ('U16', '1', '2') : ('32', '2', '4');

# Accessors for use from inline template expressions, grouped by tensor.
sub dtypeF  { $dtypeF  }
sub dsizeF  { $dsizeF  }
sub dshiftF { $dshiftF }

sub dtypeI  { $dtypeI  }
sub dsizeI  { $dsizeI  }
sub dshiftI { $dshiftI }

sub dtypeO  { $dtypeO  }
sub dsizeO  { $dsizeO  }
sub dshiftO { $dshiftO }

# Exposes the $overlapK switch unchanged.
sub overlapK { $overlapK }
-]

// Named constants and kernel-argument locations used by the code below.
//   szShareF, szShareI : element counts (17*32 and 16*32) of the F and I shared-memory tiles;
//                        the code scales them by 4 to get byte sizes.
//   addr_lut           : byte offset 4 * 2 * (szShareF + szShareI), i.e. just past two
//                        (F tile + I tile) buffers; the lookup tables are stored from here.
//   param_*            : constant-bank c[0x0] offsets read as kernel arguments. For the
//                        two-word entries, [0] feeds LEA (low word) and [1] feeds LEA.HI.X (high word).
<CONSTANT_MAPPING>

    addr_lut   : 4x<(17 + 16)*32*2>
    szShareF   : (17*32)
    szShareI   : (16*32)

    param_Block[0]  : c[0x0][0x140]
    param_Block[1]  : c[0x0][0x144]
    param_LutMPQ[0] : c[0x0][0x148]
    param_LutMPQ[1] : c[0x0][0x14c]
    param_LutCK[0]  : c[0x0][0x150]
    param_LutCK[1]  : c[0x0][0x154]
    param_O[0]      : c[0x0][0x158]
    param_O[1]      : c[0x0][0x15c]
    param_F[0]      : c[0x0][0x160]
    param_F[1]      : c[0x0][0x164]
    param_I[0]      : c[0x0][0x168]
    param_I[1]      : c[0x0][0x16c]
    param_alpha     : c[0x0][0x170]
    param_TRS       : c[0x0][0x174]
    param_magic_TRS : c[0x0][0x178]
    param_shift_TRS : c[0x0][0x17c]
    param_CDHW      : c[0x0][0x180]
    param_KMPQ      : c[0x0][0x184]

</CONSTANT_MAPPING>

// Register allocation for the whole kernel.
// Many names deliberately share the same physical range: 0-63 is both czero and the cx/cy
// accumulators; 64-95 holds the j0/j1 FFMA operands in the main loop, setup temporaries
// before it, and the shuffle_* values after it; 132-139 is the block descriptor, the LUT
// entries and the slice registers. Each alias is therefore only meaningful in its own
// phase of the kernel, and moving code across phases can silently clobber a live value.
// NOTE(review): per maxas mapping syntax, ':' is understood to pin names to the listed
// registers in order and '~' to let the assembler pick registers within the range --
// confirm against the maxas documentation.
<REGISTER_MAPPING>

        0-3 ~ idx_Blk

       0-63 : czero<00-63>

     3, 2,11,10,19,18,27,26 : cx<0-7>y0
     7, 6,15,14,23,22,31,30 : cx<0-7>y1
     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
     5, 4,13,12,21,20,29,28 : cx<0-7>y3
    35,34,43,42,51,50,59,58 : cx<0-7>y4
    39,38,47,46,55,54,63,62 : cx<0-7>y5
    33,32,41,40,49,48,57,56 : cx<0-7>y6
    37,36,45,44,53,52,61,60 : cx<0-7>y7

      64-79 : j0Fy<0-7>, j0Ix<0-7>
      80-95 : j1Fy<0-7>, j1Ix<0-7>

     96-111 : F00_<0-3>, F16_<0-3>, I0<0-3>, I8<0-3>
    112-115 : track00F<0-1>, track16F<0-1>
    116-131 : track0I<0-7>, track8I<0-7>
    132-139 : slice0I<0-3>, slice8I<0-3>
    132-135 : block<0-3>

    132 = trs0, shareMPQ0
    136 = trs8, shareMPQ8
    140 = c0, shareC0, offset0Ic
    141 = c8, shareC8, offset8Ic

    142-159 ~ writeFs, writeIs, readFs, readIs, swapBuf, k00, k16, CTRS, CTRS16, posCTRS, shareLutC, shareLutMPQ, offsetIn
    160-167 ~ tid, idx_N, idx_K, block_K, TRS, shareLutK, writeOs

      64-65 : Block<0-1>
      66-73 ~ tid1, tid16, tid16_1, readIs2, block_F, offsetK<00|16>, tidFX

    132-139 : lutEntry08<0-3>, lutEntry12<0-3>
      64-71 : lutEntry00<0-3>, lutEntry04<0-3>
      64-71 ~ ck<0-3>, cEntry<0-3>
      72-73 : Lut<0-1>
      74-88 ~ mpqOffset, lutStore, block_C, block_CK, idx_MPQ, sizeLutMPQ, offsetCK, kEntry, tidIX, trs<00|04|08|12>, partialK

      89-95 ~ tidIY<0|8>, tidFY<0-3>, partialCTRS

        0-7 : Out0_<0-1>, Out1_<0-1>, Out2_<0-1>, Out3_<0-1>
      16-31 ~ s<0-3>
      64-79 ~ s0_<0-3>, s1_<0-3>, s2_<0-3>, s3_<0-3>
      80-83 : offsetK<0-3>
      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7

    112-159 ~ tid31, tid32, alpha, readOs, lutMPQ, offsetMPQ, k<0-3>, lutK, offsetO<0-3>


</REGISTER_MAPPING>

// ---- Prologue ----
// Reads the CTA/thread ids, fetches this CTA's 4-word block descriptor, zeroes the 64
// accumulator registers through shared memory, and derives the per-thread shared-memory
// and global-memory offsets used by the rest of the kernel. It ends by sending threads
// with tid >= 32 to CHANNEL_LUT; the remaining threads fall through to the MPQ table copy.
--:-:2:-:1      S2R idx_Blk, SR_CTAID.X;
--:-:1:-:1      S2R tid,     SR_TID.X;
--:-:3:-:1      S2R idx_N,   SR_CTAID.Y;

// Block = param_Block + idx_Blk * 16 (one 16-byte descriptor per CTA)
02:-:-:-:6      LEA      Block0.CC, idx_Blk,   param_Block[0],     4;
--:-:-:-:2      LEA.HI.X Block1,    idx_Blk,   param_Block[1], RZ, 4;

--:-:2:-:1      LDG.E.CI.128 block, [Block];

<SCHEDULE_BLOCK>
// P0 = tid >= 32
01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;

<ORDERED>
// Write 16 zero bytes to shared address 0, then read them back into czero00..czero63.
// czero shares registers 0-63 with the cx/cy accumulators, so this clears the accumulators.
--:-:-:-:1      STS.128 [RZ], RZ;
[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [RZ];\n", $_ * 4), 0..15; +]

// idx_K/idx_MPQ, block_K/block_C, block_CK, block_F
// 02:-:-:-:1      SHR.U32 idx_K,   block0, 16;
// 02:-:-:-:1      LOP.AND idx_MPQ, block0, 0xffff;
// 02:-:-:-:1      SHR.U32 block_K, block1, 16;
// 02:-:-:-:1      LOP.AND block_C, block1, 0xffff;
// 02:-:-:-:1      MOV block_CK,    block2;
// 02:-:-:-:1      MOV block_F,     block3;

// Unpack the descriptor: block0 = idx_K:idx_MPQ and block1 = block_K:block_C (high:low
// 16 bits), block2 = block_CK, block3 = block_F. The order below differs from the
// commented-out listing above; the reason is not stated in this file.
02:-:-:-:1      LOP.AND block_C, block1, 0xffff;
--:-:-:-:1      SHR.U32 idx_K, block0, 16;
--:-:-:-:1      MOV block_F, block3;
--:-:-:-:1      SHR.U32 block_K, block1, 16;
--:-:-:-:1      LOP.AND idx_MPQ, block0, 0xffff;
--:-:-:-:1      MOV block_CK, block2;
</ORDERED>

// tidFX = tid >> 2
// tidFY = (tid & 3) << 2
--:-:-:-:1      SHR.U32 tidFX,  tid,    2;
--:-:-:-:1      LOP.AND tidFY0, tid,    3;
--:-:-:-:1      SHL     tidFY0, tidFY0, 2;
--:-:-:-:1      IADD    tidFY1, tidFY0, 1;
--:-:-:-:1      IADD    tidFY2, tidFY0, 2;
--:-:-:-:1      IADD    tidFY3, tidFY0, 3;
// tidIX = (tid & 7) << 2
// tidIY = tid >> 3
--:-:-:-:1      LOP.AND tidIX,  tid,    7;
--:-:-:-:1      SHL     tidIX,  tidIX,  2;
--:-:-:-:1      SHR.U32 tidIY0, tid,    3;
--:-:-:-:1      IADD    tidIY8, tidIY0, 8;

// The extra tidFY0*2 here is to avoid bank conflicts on write
// writeFs = (tidFY0*32 + tidFX + tidFY0*2) * 4
--:-:-:-:1      ISCADD writeFs, tidFY0,  tidFX,   5;
--:-:-:-:1      ISCADD writeFs, tidFY0,  writeFs, 1;
--:-:-:-:1      SHL    writeFs, writeFs, 2;


// writeIs = (tidIY0*32 + tidIX) * 4, placed after the F tile (plus szShareF * 4 bytes)
--:-:-:-:1      ISCADD writeIs, tidIY0, tidIX, 5;
--:-:-:-:1      ISCADD writeIs, writeIs, 4x<szShareF>, 2;



// readFs = ((tid & 8) >> 2) | (tid & 1)
--:-:-:-:1      LOP.AND tid1,   tid,    1;
--:-:-:-:1      LOP.AND readFs, tid,    8;
--:-:-:-:1      SHR.U32 readFs, readFs, 2;
--:-:-:-:1      LOP.OR  readFs, readFs, tid1;

// readIs  = (tid >> 1) & 3
--:-:-:-:1      BFE.U32 readIs, tid, 0x201; // 2 bits at position 1

// tid16 = tid & -16
--:-:-:-:1      LOP.AND tid16, tid, -16;

// Arrange 8 tiles horizontally in the I direction
// Add some spacing (readFs << 2) to avoid write bank conflicts
// readIs2 = readIs + (tid16 >> 1) + (readFs << 2)
--:-:-:-:1      SHR.U32 tid16_1, tid16, 1;
--:-:-:-:1      IADD    readIs2, tid16_1, readIs;
--:-:-:-:1      ISCADD  readIs2, readFs,  readIs2, 2;

// readFs  <<= 4
// readIs  <<= 4
// readIs2 <<= 4
--:-:-:-:1      SHL readFs,  readFs,  4;
--:-:-:-:1      SHL readIs,  readIs,  4;
--:-:-:-:1      SHL readIs2, readIs2, 4;

// writeOs = readFs*32*4 + readIs2
--:-:-:-:1      ISCADD writeOs, readFs, readIs2, 7;

// Each block of 16 threads works on 4 lines
// readFs += tid16 / 4 * 32 * 4
// readIs += tid16 / 4 * 32 * 4
--:-:-:-:1      ISCADD readFs, tid16, readFs, 5;
--:-:-:-:1      ISCADD readIs, tid16, readIs, 5;

// Shift each group of 16 threads over by 8 when not in contig mode.
// readFs += tid16 / 2  * 4
--:-:-:-:1      ISCADD readFs, tid16, readFs, 1;

// Offset readIs by size of ShareF
--:-:-:-:1      IADD  readIs, readIs, 4x<szShareF>;

// sizeLutMPQ  = TRS * 32 + 32
// shareLutC   = sizeLutMPQ * 4 + addr_lut
// shareLutK   = block_C * 4 + shareLutC
// shareLutMPQ = tidIX * 4 + addr_lut
--:-:-:-:1      MOV TRS, param_TRS;
--:-:-:-:1      ISCADD sizeLutMPQ, TRS, 32, 5;
--:-:-:-:1      ISCADD shareLutC, sizeLutMPQ, addr_lut, 2;
--:-:-:-:1      ISCADD shareLutK, block_C, shareLutC, 2;
--:-:-:-:1      ISCADD shareLutMPQ, tidIX, addr_lut, 2;

// Align this for LDS.U.128 in STORE_O
// partialK = shareLutK & 15; when nonzero, round shareLutK up to the next multiple of 16.
--:-:-:-:1      LOP.AND.NZ P1, partialK, shareLutK, 15;
--:-:-:-:1  @P1 IADD3 shareLutK, shareLutK, 16, -partialK;

// CTRS = C * TRS
--:-:-:-:1      XMAD CTRS, block_C, param_TRS, RZ;
--:-:-:-:1      IADD CTRS16, CTRS, 16;

// KCTRS
// k = idx_K * 32 + tidFX
// offsetK = k * CTRS + tidFY0 + block_F
--:-:-:-:1      ISCADD k00, idx_K, tidFX, 5;
--:-:-:-:1      IADD block_F, block_F, tidFY0;
--:-:-:-:1      IADD k16, k00, 16;
--:-:-:-:1      XMAD offsetK00, k00, CTRS, block_F;
--:-:-:-:1      XMAD offsetK16, k16, CTRS, block_F;

// track00F / track16F = param_F + offsetK scaled by the F element size (64-bit addresses)
--:-:-:-:1      LEA      track00F0.CC, offsetK00,   param_F[0],     [+ dshiftF() +];
--:-:-:-:1      LEA.HI.X track00F1,    offsetK00,   param_F[1], RZ, [+ dshiftF() +];

--:-:-:-:1      LEA      track16F0.CC, offsetK16,   param_F[0],     [+ dshiftF() +];
--:-:-:-:0      LEA.HI.X track16F1,    offsetK16,   param_F[1], RZ, [+ dshiftF() +];

// If this value is not a multiple of 16 we want to grab the partial amount on the first fetch.
// If it is a multiple of 16 then make a full 16 line fetch.
--:-:-:-:1      LOP.AND.Z P4, partialCTRS, CTRS, 15;
--:-:-:-:1  @P4 MOV partialCTRS, 16;

// offsetIn = idx_N * CDHW
04:-:-:-:0      XMAD.LO2C offsetIn, idx_N, param_CDHW, RZ;

</SCHEDULE_BLOCK>

// Threads with tid >= 32 load the C/K lookup tables; the others continue below.
--:-:-:-:5  @P0 BRA.U CHANNEL_LUT;

// ---- MPQ lookup-table copy (threads with tid < 32) ----
// Copies this block's MPQ table from global memory (param_LutMPQ) into shared memory
// starting at addr_lut. Each thread moves one 16-byte entry for each of four rows
// (trs00/04/08/12) per pass, so a pass covers 16 rows; rows are copied while trs <= TRS.
// lutStore  = (tidIY0*32 + tidIX) * 4 + addr_lut - 16*32*4
// mpqOffset = sizeLutMPQ * idx_MPQ + tidIY0*32 + tidIX
<SCHEDULE_BLOCK>
--:-:-:-:1      IADD trs00, tidIY0, 0;
--:-:-:-:1      IADD trs04, tidIY0, 4;
--:-:-:-:1      IADD trs08, tidIY0, 8;
--:-:-:-:1      IADD trs12, tidIY0, 12;

--:-:-:-:1      ISCADD lutStore, tidIY0, tidIX, 5;
--:-:-:-:1      ISETP.LE.AND P0, PT, trs00, param_TRS, PT; // LE to get single output map row as well
--:-:-:-:1      XMAD mpqOffset, sizeLutMPQ, idx_MPQ, lutStore;
// lutStore starts one pass (16*32 words) below addr_lut because the loop adds that
// amount before its first store.
--:-:-:-:4      ISCADD lutStore, lutStore, 4x<(szShareF + szShareI)*2 - 16*32>, 2;
</SCHEDULE_BLOCK>


MPQ_LOOP:

<SCHEDULE_BLOCK>
--:-:-:-:1      ISETP.LE.AND P1, PT, trs04, param_TRS, PT;
--:-:-:-:1      ISETP.LE.AND P2, PT, trs08, param_TRS, PT;
--:-:-:-:1      ISETP.LE.AND P3, PT, trs12, param_TRS, PT;

--:-:-:-:1      LEA      Lut0.CC, mpqOffset, param_LutMPQ[0],     2;
--:-:-:-:1      LEA.HI.X Lut1,    mpqOffset, param_LutMPQ[1], RZ, 2;

<ORDERED>
// Each load reuses the barrier number of the matching store below, so a register is not
// reloaded until the previous pass's store has read it.
01:-:1:-:1  @P0 LDG.E.CI.128 lutEntry00, [Lut + 4x< 0*32>];
02:-:2:-:1  @P1 LDG.E.CI.128 lutEntry04, [Lut + 4x< 4*32>];
04:-:3:-:1  @P2 LDG.E.CI.128 lutEntry08, [Lut + 4x< 8*32>];
08:-:4:-:1  @P3 LDG.E.CI.128 lutEntry12, [Lut + 4x<12*32>];
--:-:-:-:1      IADD lutStore, lutStore, 4x<16*32>;
</ORDERED>

// Advance the row counters and the global offset by 16 rows for the next pass.
--:-:-:-:1      IADD trs00, trs00, 16;
--:-:-:-:1      IADD trs04, trs04, 16;
--:-:-:-:1      IADD trs08, trs08, 16;
--:-:-:-:1      IADD trs12, trs12, 16;
--:-:-:-:1      IADD mpqOffset, mpqOffset, 1x<16*32>;

01:1:-:-:1  @P0 STS.128 [lutStore + 4x< 0*32>], lutEntry00;
02:2:-:-:1  @P1 STS.128 [lutStore + 4x< 4*32>], lutEntry04;
04:3:-:-:1  @P2 STS.128 [lutStore + 4x< 8*32>], lutEntry08;
08:4:-:Y:b  @P3 STS.128 [lutStore + 4x<12*32>], lutEntry12;

// Loop again while the first row of the next pass is still within the table.
--:-:-:-:1      ISETP.LE.AND P0, PT, trs00, param_TRS, PT;
</SCHEDULE_BLOCK>
--:-:-:-:5  @P0 BRA.U MPQ_LOOP;
--:-:-:-:5      BRA.U END_LUT;

// ---- C / K lookup-table copy (threads with tid >= 32) ----
// Copies block_C channel entries from param_LutCK into shared memory at shareLutC,
// 4 * 32 entries per pass, and then one K entry per thread into shared memory at shareLutK.
CHANNEL_LUT:

<SCHEDULE_BLOCK>
// Load all C lut entries and the 32 values of K we need
// ck0 = tid & 31
// P4  = idx_K*32 + ck0 < block_K
// offsetCK = block_CK + block_C + idx_K*32 + ck0   (address of this thread's K entry)
--:-:-:-:1      LOP.AND ck0, tid, 31;
--:-:-:-:1      IADD    ck1, ck0, 1x<32>;
--:-:-:-:1      IADD    ck2, ck0, 2x<32>;
--:-:-:-:1      IADD    ck3, ck0, 3x<32>;
--:-:-:-:1      ISETP.LT.AND P0, PT, ck0, block_C, PT;
--:-:-:-:1      ISCADD offsetCK, idx_K, ck0, 5;
--:-:-:-:1      ISETP.LT.AND P4, PT, offsetCK, block_K, PT;
--:-:-:-:1      IADD3  offsetCK, block_CK, block_C, offsetCK;
--:-:-:-:1      LEA      Lut0.CC, offsetCK, param_LutCK[0],     2;
--:-:-:-:1      LEA.HI.X Lut1,    offsetCK, param_LutCK[1], RZ, 2;
--:5:-:-:1  @P4 LDG.E.CI.32 kEntry, [Lut];

// offsetCK = block_CK + ck0   (now addresses the C entries)
--:-:-:-:1      IADD  offsetCK, block_CK, ck0;
</SCHEDULE_BLOCK>

CHANNEL_LOOP:

--:-:-:-:2      ISETP.LT.AND P1, PT, ck1, block_C, PT;
--:-:-:-:2      ISETP.LT.AND P2, PT, ck2, block_C, PT;
--:-:-:-:2      ISETP.LT.AND P3, PT, ck3, block_C, PT;

// Lut is rebuilt here; the wait mask 10 matches the read barrier (5) set by the kEntry
// load above, which still needs the old Lut value.
10:-:-:-:6      LEA      Lut0.CC, offsetCK, param_LutCK[0],     2;
--:-:-:-:2      LEA.HI.X Lut1,    offsetCK, param_LutCK[1], RZ, 2;

// lutStore = ck0 * 4 + shareLutC
01:-:-:-:0      ISCADD  lutStore, ck0, shareLutC, 2;

// Each load is paired with the increment of its own channel counter for the next pass.
--:-:1:-:1  @P0 LDG.E.CI.32 cEntry0, [Lut + 4x<0*32>];
--:-:-:-:0      IADD ck0, ck0, 4x<32>;
--:-:2:-:1  @P1 LDG.E.CI.32 cEntry1, [Lut + 4x<1*32>];
--:-:-:-:0      IADD ck1, ck1, 4x<32>;
--:-:3:-:1  @P2 LDG.E.CI.32 cEntry2, [Lut + 4x<2*32>];
--:-:-:-:0      IADD ck2, ck2, 4x<32>;
--:-:4:-:1  @P3 LDG.E.CI.32 cEntry3, [Lut + 4x<3*32>];
--:-:-:-:0      IADD ck3, ck3, 4x<32>;

// P0 is recomputed for the next pass right after the store that used it.
01:-:-:-:3  @P0 STS [lutStore + 4x<0*32>], cEntry0;
--:-:-:-:0      ISETP.LT.AND P0, PT, ck0, block_C, PT;
02:-:-:-:1  @P1 STS [lutStore + 4x<1*32>], cEntry1;
--:-:-:-:0      IADD offsetCK, offsetCK, 4x<32>;
04:-:-:-:1  @P2 STS [lutStore + 4x<2*32>], cEntry2;
08:1:-:Y:b  @P3 STS [lutStore + 4x<3*32>], cEntry3;

--:-:-:-:5  @P0 BRA.U CHANNEL_LOOP;

// Store this thread's K entry at shareLutK + (tid & 31) * 4.
// NOTE(review): the kEntry load sets only a read barrier; this store appears to rely on
// the later loads/stores having completed first -- confirm.
--:-:-:-:6  @P4 LOP.AND ck0, tid, 31;
01:-:-:-:4  @P4 ISCADD  lutStore, ck0, shareLutK, 2;
--:1:-:-:2  @P4 STS [lutStore], kEntry;

// ---- First fetch and first shared-memory fill ----
// Both lookup-table paths join here. After the CTA-wide sync the first (possibly partial)
// slice is fetched through DO_LOADS, converted from fp16 where needed, and stored to the
// first shared buffer. The write pointers are then moved to the second buffer, the first
// FFMA operands are read, and the next slice is fetched before entering LOOP.
END_LUT:

0f:-:-:-:5      BAR.SYNC 0;

// posCTRS = tidIY0
--:-:-:-:0      MOV posCTRS, tidIY0;
--:-:-:-:6      CAL DO_LOADS;
// P5 = posCTRS < CTRS, evaluated after DO_LOADS advanced posCTRS by partialCTRS
--:-:-:-:0      ISETP.LT.AND P5, PT, posCTRS, CTRS, PT;
[+
    return F16() ? q{
04:-:-:-:1      F2F.F32.F16 F00_0, F00_0;
--:-:-:-:1      F2F.F32.F16 F00_1, F00_1;
--:-:-:-:1      F2F.F32.F16 F00_2, F00_2;
--:-:3:-:1      F2F.F32.F16 F00_3, F00_3;

08:-:-:-:1      F2F.F32.F16 F16_0, F16_0;
--:-:-:-:1      F2F.F32.F16 F16_1, F16_1;
--:-:-:-:1      F2F.F32.F16 F16_2, F16_2;
--:-:4:-:1      F2F.F32.F16 F16_3, F16_3;
    } : '';
+]
[+
    return I16() ? q{
10:-:-:-:1      F2F.F32.F16 I00, I00;
--:-:-:-:1      F2F.F32.F16 I01, I01;
--:-:-:-:1      F2F.F32.F16 I02, I02;
--:-:5:-:1      F2F.F32.F16 I03, I03;

20:-:-:-:1      F2F.F32.F16 I80, I80;
--:-:-:-:1      F2F.F32.F16 I81, I81;
--:-:-:-:1      F2F.F32.F16 I82, I82;
--:-:6:-:1      F2F.F32.F16 I83, I83;
    } : '';
+]
// Store the first slice. The F pointers are advanced by partialCTRS elements in between:
// each LEA sets the carry and the matching IADD.X further down adds it to the high word,
// so nothing that writes CC may be placed between a LEA and its IADD.X.
04:-:-:-:1      STS [writeFs + 4x<0*32 +  0>], F00_0;
--:-:-:-:0      LEA    track00F0.CC, partialCTRS, track00F0, [+ dshiftF() +];
--:-:-:-:1      STS [writeFs + 4x<1*32 +  0>], F00_1;
--:-:-:-:1      STS [writeFs + 4x<2*32 +  0>], F00_2;
--:-:-:-:1      STS [writeFs + 4x<3*32 +  0>], F00_3;
08:-:-:-:1      STS [writeFs + 4x<0*32 + 16>], F16_0;
--:-:-:-:1      STS [writeFs + 4x<1*32 + 16>], F16_1;
--:-:-:-:1      STS [writeFs + 4x<2*32 + 16>], F16_2;
--:-:-:-:0      IADD.X track00F1, track00F1, RZ;
--:-:-:-:1      STS [writeFs + 4x<3*32 + 16>], F16_3;
--:-:-:-:0      LEA    track16F0.CC, partialCTRS, track16F0, [+ dshiftF() +];
10:-:-:-:1      STS.128 [writeIs + 4x<0*32>], I0;
20:-:-:-:1      STS.128 [writeIs + 4x<8*32>], I8;
--:-:-:-:5      BAR.SYNC 0;

--:-:-:-:1      IADD.X track16F1, track16F1, RZ;

<SCHEDULE_BLOCK>
// Point the write side at the second buffer; swapBuf holds the negated buffer size, which
// the loop uses to flip the read and write pointers.
--:-:-:-:1      IADD32I writeFs, writeFs, 4x<szShareF + szShareI>;
--:-:-:-:1      IADD32I writeIs, writeIs, 4x<szShareF + szShareI>;
--:-:-:-:1      MOV32I swapBuf, -4x<szShareF + szShareI>;
// partialCTRS = P5 ? 16 : 0, so the second DO_LOADS fetches a full slice or nothing.
--:-:-:-:1      SEL partialCTRS, RZ, 16, !P5;

--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*32 + 00>];
--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*32 + 00>];
--:-:-:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*32 + 16>];
--:-:1:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*32 + 16>];
</SCHEDULE_BLOCK>

--:-:-:-:5      CAL DO_LOADS;
--:-:-:-:5      BRA.U LOOP;


// ---- DO_LOADS subroutine (entered with CAL, leaves with RET) ----
// Issues the global loads for one slice of up to 16 CTRS lines.
//   In : posCTRS (this thread's first line), partialCTRS (number of valid lines in the
//        slice), track00F/track16F, and the shared lookup tables.
//   Out: F00_*, F16_*, I0*, I8* (zeroed where the load is predicated off), with the loads
//        still in flight on write barriers 3, 4, 5 and 6; posCTRS advanced by partialCTRS.
//   Overwrites predicates P0-P6.
DO_LOADS:

<SCHEDULE_BLOCK>
--:-:-:-:1      ISETP.LT.AND P0, PT, tidIY0, partialCTRS, PT;
--:-:-:-:1      ISETP.LT.AND P4, PT, tidIY8, partialCTRS, PT;

// c0 = posCTRS / TRS via magic multiply and shift, trs0 = posCTRS - c0 * TRS
--:-:-:-:1      XMAD c0, posCTRS, param_magic_TRS, RZ;
--:-:-:-:1      SHR.U32 c0, c0, param_shift_TRS;
--:-:-:-:1      VMAD.U16.U16 trs0, -c0, TRS, posCTRS;
// shareC0 = c0 * 4 + shareLutC, shareMPQ0 = trs0 * 128 + shareLutMPQ
--:-:-:-:1      ISCADD shareC0, c0, shareLutC, 2;
--:-:-:-:1      ISCADD shareMPQ0, trs0, shareLutMPQ, 7;

--:-:-:-:1      IADD posCTRS, posCTRS, 8;

// Same decomposition for the line 8 further on.
--:-:-:-:1      XMAD c8, posCTRS, param_magic_TRS, RZ;
--:-:-:-:1      SHR.U32 c8, c8, param_shift_TRS;
--:-:-:-:1      VMAD.U16.U16 trs8, -c8, TRS, posCTRS;
--:-:-:-:1      ISCADD shareC8, c8, shareLutC, 2;
--:-:-:-:1      ISCADD shareMPQ8, trs8, shareLutMPQ, 7;

// Undo the temporary +8 and advance to the start of the next slice.
--:-:-:-:1      IADD3 posCTRS, posCTRS, -8, partialCTRS;

<ORDERED>
// Note that c0/shareC0/offset0Ic share one register and trs0/shareMPQ0 share the first
// slice0I register (likewise for the 8 variants), so the order of these loads matters.
--:-:-:-:1  @P0 LDS.U.32  offset0Ic, [shareC0];
--:-:5:-:1  @P0 LDS.U.128 slice0I,   [shareMPQ0];
--:-:-:-:1  @P4 LDS.U.32  offset8Ic, [shareC8];
--:-:6:-:1  @P4 LDS.U.128 slice8I,   [shareMPQ8];
</ORDERED>

// k < block_K
--:-:-:-:1      ISETP.LT.AND P5, PT, k00, block_K, PT;
--:-:-:-:1      ISETP.LT.AND P6, PT, k16, block_K, PT;

// F loads for k00: each of the four lines is loaded only if it lies inside the slice and
// k00 is valid; otherwise the register is zeroed.
--:-:-:-:1      ISETP.LT.AND P0, PT, tidFY0, partialCTRS, P5;
--:-:-:-:1      ISETP.LT.AND P1, PT, tidFY1, partialCTRS, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, tidFY2, partialCTRS, P5;
--:-:-:-:1      ISETP.LT.AND P3, PT, tidFY3, partialCTRS, P5;
--:-:-:-:1 @!P0 MOV F00_0, RZ;
--:-:-:-:1 @!P1 MOV F00_1, RZ;
--:-:-:-:1 @!P2 MOV F00_2, RZ;
--:-:-:-:1 @!P3 MOV F00_3, RZ;
<ORDERED>
--:-:-:-:1  @P0 LDG.E.CI.[+ dtypeF() +] F00_0, [track00F + 1x<0*$dsizeF>];
--:-:-:-:1  @P1 LDG.E.CI.[+ dtypeF() +] F00_1, [track00F + 1x<1*$dsizeF>];
--:-:-:-:1  @P2 LDG.E.CI.[+ dtypeF() +] F00_2, [track00F + 1x<2*$dsizeF>];
--:-:3:-:1  @P3 LDG.E.CI.[+ dtypeF() +] F00_3, [track00F + 1x<3*$dsizeF>];
</ORDERED>

// Same for k16.
--:-:-:-:1      ISETP.LT.AND P0, PT, tidFY0, partialCTRS, P6;
--:-:-:-:1      ISETP.LT.AND P1, PT, tidFY1, partialCTRS, P6;
--:-:-:-:1      ISETP.LT.AND P2, PT, tidFY2, partialCTRS, P6;
--:-:-:-:1      ISETP.LT.AND P3, PT, tidFY3, partialCTRS, P6;
--:-:-:-:1 @!P0 MOV F16_0, RZ;
--:-:-:-:1 @!P1 MOV F16_1, RZ;
--:-:-:-:1 @!P2 MOV F16_2, RZ;
--:-:-:-:1 @!P3 MOV F16_3, RZ;
<ORDERED>
--:-:-:-:1  @P0 LDG.E.CI.[+ dtypeF() +] F16_0, [track16F + 1x<0*$dsizeF>];
--:-:-:-:1  @P1 LDG.E.CI.[+ dtypeF() +] F16_1, [track16F + 1x<1*$dsizeF>];
--:-:-:-:1  @P2 LDG.E.CI.[+ dtypeF() +] F16_2, [track16F + 1x<2*$dsizeF>];
--:-:4:-:1  @P3 LDG.E.CI.[+ dtypeF() +] F16_3, [track16F + 1x<3*$dsizeF>];
</ORDERED>

// P5 is reused: it now means line tidIY0 lies inside the slice.
--:-:-:-:1      ISETP.LT.AND P5, PT, tidIY0, partialCTRS, PT;

// I loads for line tidIY0: a negative slice entry disables the load and the register is
// zeroed instead. Address = param_I + (slice + offset0Ic + offsetIn) scaled to bytes.
10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5;
--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;
--:-:-:-:1      ISETP.GE.AND P2, PT, slice0I2, RZ, P5;
--:-:-:-:1      ISETP.GE.AND P3, PT, slice0I3, RZ, P5;
--:-:-:-:1      IADD3 slice0I0, slice0I0, offset0Ic, offsetIn;
--:-:-:-:1      IADD3 slice0I1, slice0I1, offset0Ic, offsetIn;
--:-:-:-:1      IADD3 slice0I2, slice0I2, offset0Ic, offsetIn;
--:-:-:-:1      IADD3 slice0I3, slice0I3, offset0Ic, offsetIn;
--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     [+ dshiftI() +];
--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, [+ dshiftI() +];
--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     [+ dshiftI() +];
--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, [+ dshiftI() +];
--:-:-:-:1      LEA      track0I4.CC, slice0I2,   param_I[0],     [+ dshiftI() +];
--:-:-:-:1      LEA.HI.X track0I5,    slice0I2,   param_I[1], RZ, [+ dshiftI() +];
--:-:-:-:1      LEA      track0I6.CC, slice0I3,   param_I[0],     [+ dshiftI() +];
--:-:-:-:1      LEA.HI.X track0I7,    slice0I3,   param_I[1], RZ, [+ dshiftI() +];
--:-:-:-:1 @!P0 MOV I00, RZ;
--:-:-:-:1 @!P1 MOV I01, RZ;
--:-:-:-:1 @!P2 MOV I02, RZ;
--:-:-:-:1 @!P3 MOV I03, RZ;
<ORDERED>
--:-:-:-:1  @P0 LDG.E.CI.[+ dtypeI() +] I00, [track0I0];
--:-:-:-:1  @P1 LDG.E.CI.[+ dtypeI() +] I01, [track0I2];
--:-:-:-:1  @P2 LDG.E.CI.[+ dtypeI() +] I02, [track0I4];
--:-:5:-:1  @P3 LDG.E.CI.[+ dtypeI() +] I03, [track0I6];
</ORDERED>

// Same for line tidIY8, gated by P4 (tidIY8 inside the slice).
20:-:-:-:1      ISETP.GE.AND P0, PT, slice8I0, RZ, P4;
--:-:-:-:1      ISETP.GE.AND P1, PT, slice8I1, RZ, P4;
--:-:-:-:1      ISETP.GE.AND P2, PT, slice8I2, RZ, P4;
--:-:-:-:1      ISETP.GE.AND P3, PT, slice8I3, RZ, P4;
--:-:-:-:1      IADD3 slice8I0, slice8I0, offset8Ic, offsetIn;
--:-:-:-:1      IADD3 slice8I1, slice8I1, offset8Ic, offsetIn;
--:-:-:-:1      IADD3 slice8I2, slice8I2, offset8Ic, offsetIn;
--:-:-:-:1      IADD3 slice8I3, slice8I3, offset8Ic, offsetIn;
--:-:-:-:1      LEA      track8I0.CC, slice8I0,   param_I[0],     [+ dshiftI() +];
--:-:-:-:1      LEA.HI.X track8I1,    slice8I0,   param_I[1], RZ, [+ dshiftI() +];
--:-:-:-:1      LEA      track8I2.CC, slice8I1,   param_I[0],     [+ dshiftI() +];
--:-:-:-:1      LEA.HI.X track8I3,    slice8I1,   param_I[1], RZ, [+ dshiftI() +];
--:-:-:-:1      LEA      track8I4.CC, slice8I2,   param_I[0],     [+ dshiftI() +];
--:-:-:-:1      LEA.HI.X track8I5,    slice8I2,   param_I[1], RZ, [+ dshiftI() +];
--:-:-:-:1      LEA      track8I6.CC, slice8I3,   param_I[0],     [+ dshiftI() +];
--:-:-:-:1      LEA.HI.X track8I7,    slice8I3,   param_I[1], RZ, [+ dshiftI() +];
--:-:-:-:1 @!P0 MOV I80, RZ;
--:-:-:-:1 @!P1 MOV I81, RZ;
--:-:-:-:1 @!P2 MOV I82, RZ;
--:-:-:-:1 @!P3 MOV I83, RZ;
<ORDERED>
--:-:-:-:1  @P0 LDG.E.CI.[+ dtypeI() +] I80, [track8I0];
--:-:-:-:1  @P1 LDG.E.CI.[+ dtypeI() +] I81, [track8I2];
--:-:-:-:1  @P2 LDG.E.CI.[+ dtypeI() +] I82, [track8I4];
--:-:6:-:1  @P3 LDG.E.CI.[+ dtypeI() +] I83, [track8I6];
</ORDERED>
</SCHEDULE_BLOCK>

--:-:-:-:5      RET;


// ---- Main loop ----
// P5 = posCTRS < CTRS   : another slice remains to be fetched from global memory.
// P6 = posCTRS < CTRS16 : the slice fetched previously still has to be stored to shared
//                         memory and consumed, so the loop runs one more pass.
// posCTRS advances by 16 per pass (two increments of 8 inside the generated body).
LOOP:
--:-:-:-:1      ISETP.LT.AND P5, PT, posCTRS, CTRS,   PT;
--:-:-:-:1      XMAD c0, posCTRS, param_magic_TRS, RZ;
--:-:-:-:1      ISETP.LT.AND P6, PT, posCTRS, CTRS16, PT;
[+
    # Generates the loop body: 4 unroll steps (j = 0..3) of 64 FFMAs each, with the
    # address math, global loads, fp16 conversions and shared stores for the next slice
    # spliced in between the FFMAs.
    our ($dtypeF, $dshiftF, $dsizeF, $dtypeI, $dshiftI, $dsizeI);

    # %insert maps a slot name "jNcM" to the instruction text emitted immediately after
    # FFMA number M of unroll step N. Slot positions are hand-tuned; entries that share a
    # barrier number, a predicate or the carry flag depend on their relative order, so
    # moving a slot needs the same care as reordering the instructions themselves.
    my %insert = (

        # Finish c0 = posCTRS / TRS, derive trs0 and the shared LUT addresses for the
        # next slice, and advance posCTRS by 8. P0 = k00 valid and another slice remains.
        j0c3  => "--:-:-:-:1      SHR.U32 c0, c0, param_shift_TRS;\n",
        j0c8  => "--:-:-:-:1  \@P5 VMAD.U16.U16 trs0, -c0, TRS, posCTRS;\n" .
                 "--:-:-:-:1  \@P5 ISCADD shareC0, c0, shareLutC, 2;\n" .
                 "--:-:-:-:1      IADD posCTRS, posCTRS, 8;\n" .
                 "--:-:-:-:1      ISETP.LT.AND P0, PT, k00, block_K, P5;\n",
        j0c10 => "--:-:-:-:1  \@P5 ISCADD shareMPQ0, trs0, shareLutMPQ, 7;\n",

        # fp16 builds only: convert the previously loaded F00 values before storing them.
        F16() ? (
            j0c9  => "04:-:-:-:1  \@P6 F2F.F32.F16 F00_0, F00_0;\n",
            j0c11 => "--:-:-:-:1  \@P6 F2F.F32.F16 F00_1, F00_1;\n",
            j0c13 => "--:-:-:-:1  \@P6 F2F.F32.F16 F00_2, F00_2;\n",
            j0c15 => "--:-:3:-:1  \@P6 F2F.F32.F16 F00_3, F00_3;\n",
        ) : (),

        # Same decomposition for the line 8 further on; posCTRS ends up 16 ahead.
        # P1 = k16 valid and another slice remains.
        j0c12 => "--:-:-:-:1  \@P5 XMAD c8, posCTRS, param_magic_TRS, RZ;\n",
        j0c17 => "--:-:-:-:1  \@P5 SHR.U32 c8, c8, param_shift_TRS;\n",
        j0c22 => "--:-:-:-:1  \@P5 VMAD.U16.U16 trs8, -c8, TRS, posCTRS;\n" .
                 "--:-:-:-:1  \@P5 ISCADD shareC8, c8, shareLutC, 2;\n" .
                 "--:-:-:-:1      IADD posCTRS, posCTRS, 8;\n" .
                 "--:-:-:-:1      ISETP.LT.AND P1, PT, k16, block_K, P5;\n",
        j0c24 => "--:-:-:-:1  \@P5 ISCADD shareMPQ8, trs8, shareLutMPQ, 7;\n",


        # Store the previous slice's F00 values; the last store sets read barrier 3,
        # which the F00 reload below waits on before overwriting the registers.
        j0c23 => "04:-:-:-:1  \@P6 STS [writeFs + 4x<0*32 + 0>], F00_0;\n",
        j0c25 => "--:-:-:-:1  \@P6 STS [writeFs + 4x<1*32 + 0>], F00_1;\n",
        j0c27 => "--:-:-:-:1  \@P6 STS [writeFs + 4x<2*32 + 0>], F00_2;\n",
        j0c29 => "--:3:-:-:1  \@P6 STS [writeFs + 4x<3*32 + 0>], F00_3;\n",

        # NOTE(review): these two LDS set no barrier, and the consumer of slice0I at j2c5
        # has no wait mask. Presumably the many FFMAs in between are relied on to cover
        # the shared-memory latency -- confirm before moving either slot closer.
        j0c31 => "--:-:-:-:1  \@P5 LDS.U.32  offset0Ic, [shareC0];\n",
        j0c33 => "--:-:-:-:1  \@P5 LDS.U.128 slice0I,   [shareMPQ0];\n",

        # Advance the 64-bit F pointer by 16 elements; the IADD.X consumes the carry
        # produced by the IADD, so nothing that writes CC may be inserted between them.
        j0c30 => "--:-:-:-:1  \@P0 IADD   track00F0.CC, track00F0, ${dsizeF}x<16>;\n",
        j0c35 => "--:-:-:-:1  \@P0 IADD.X track00F1,    track00F1, RZ;\n",

        j0c45 => "04:-:-:-:1  \@P0 LDG.E.CI.$dtypeF F00_0, [track00F + ${dsizeF}x<0>];\n",
        j0c47 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtypeF F00_1, [track00F + ${dsizeF}x<1>];\n",
        j0c49 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtypeF F00_2, [track00F + ${dsizeF}x<2>];\n",
        j0c51 => "--:-:3:-:1  \@P0 LDG.E.CI.$dtypeF F00_3, [track00F + ${dsizeF}x<3>];\n",

        # The same convert / store / advance / reload sequence for F16, on barrier 4.
        F16() ? (
            j0c53 => "08:-:-:-:1  \@P6 F2F.F32.F16 F16_0, F16_0;\n",
            j0c55 => "--:-:-:-:1  \@P6 F2F.F32.F16 F16_1, F16_1;\n",
            j0c57 => "--:-:-:-:1  \@P6 F2F.F32.F16 F16_2, F16_2;\n",
            j0c59 => "--:-:4:-:1  \@P6 F2F.F32.F16 F16_3, F16_3;\n",
        ) : (),

        j1c8  => "08:-:-:-:1  \@P6 STS [writeFs + 4x<0*32 + 16>], F16_0;\n",
        j1c10 => "--:-:-:-:1  \@P6 STS [writeFs + 4x<1*32 + 16>], F16_1;\n",
        j1c12 => "--:-:-:-:1  \@P6 STS [writeFs + 4x<2*32 + 16>], F16_2;\n",
        j1c14 => "--:4:-:-:1  \@P6 STS [writeFs + 4x<3*32 + 16>], F16_3;\n",

        # slice8I load sets write barrier 2; its consumer at j2c52 waits on it.
        j1c16 => "--:-:-:-:1  \@P5 LDS.U.32  offset8Ic, [shareC8];\n",
        j1c18 => "--:-:2:-:1  \@P5 LDS.U.128 slice8I,   [shareMPQ8];\n",

        j1c15 => "--:-:-:-:1  \@P1 IADD   track16F0.CC, track16F0, ${dsizeF}x<16>;\n",
        j1c20 => "--:-:-:-:1  \@P1 IADD.X track16F1,    track16F1, RZ;\n",

        j1c25 => "08:-:-:-:1  \@P1 LDG.E.CI.$dtypeF F16_0, [track16F + ${dsizeF}x<0>];\n",
        j1c27 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtypeF F16_1, [track16F + ${dsizeF}x<1>];\n",
        j1c29 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtypeF F16_2, [track16F + ${dsizeF}x<2>];\n",
        j1c31 => "--:-:4:-:1  \@P1 LDG.E.CI.$dtypeF F16_3, [track16F + ${dsizeF}x<3>];\n",

        # I0: convert (fp16 builds), store the previous slice, then rebuild the four
        # addresses and reload. Barrier 5 is shared by the load, conversion and store.
        I16() ? (
            j1c40 => "10:-:-:-:1  \@P6 F2F.F32.F16 I00, I00;\n",
            j1c42 => "--:-:-:-:1  \@P6 F2F.F32.F16 I01, I01;\n",
            j1c44 => "--:-:-:-:1  \@P6 F2F.F32.F16 I02, I02;\n",
            j1c46 => "--:-:5:-:1  \@P6 F2F.F32.F16 I03, I03;\n",
        ) : (),

        j1c62 => "10:5:-:-:1  \@P6 STS.128 [writeIs + 4x<0*32>], I0;\n",

        # From here P0-P3 are reused as per-element load predicates: the slice entry is
        # non-negative and another slice remains (P5). Each LEA / LEA.HI.X pair is split
        # across slots, so no other CC writer may land between the two halves.
        j2c5  => "--:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5;\n" .
                 "--:-:-:-:1  \@P5 IADD3 slice0I0, slice0I0, offset0Ic, offsetIn;\n" .
                 "--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;\n" .
                 "--:-:-:-:1  \@P5 IADD3 slice0I1, slice0I1, offset0Ic, offsetIn;\n" .
                 "--:-:-:-:1      ISETP.GE.AND P2, PT, slice0I2, RZ, P5;\n" .
                 "--:-:-:-:1  \@P5 IADD3 slice0I2, slice0I2, offset0Ic, offsetIn;\n" .
                 "--:-:-:-:1      ISETP.GE.AND P3, PT, slice0I3, RZ, P5;\n" .
                 "--:-:-:-:1  \@P5 IADD3 slice0I3, slice0I3, offset0Ic, offsetIn;\n" .
                 "--:-:-:-:1  \@P5 LEA      track0I0.CC, slice0I0,   param_I[0],     $dshiftI;\n",
        j2c10 => "--:-:-:-:1  \@P5 LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshiftI;\n",
        j2c11 => "--:-:-:-:1  \@P5 LEA      track0I2.CC, slice0I1,   param_I[0],     $dshiftI;\n",
        j2c16 => "--:-:-:-:1  \@P5 LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshiftI;\n",
        j2c17 => "--:-:-:-:1  \@P5 LEA      track0I4.CC, slice0I2,   param_I[0],     $dshiftI;\n",
        j2c22 => "--:-:-:-:1  \@P5 LEA.HI.X track0I5,    slice0I2,   param_I[1], RZ, $dshiftI;\n",
        j2c23 => "--:-:-:-:1  \@P5 LEA      track0I6.CC, slice0I3,   param_I[0],     $dshiftI;\n",
        j2c28 => "--:-:-:-:1  \@P5 LEA.HI.X track0I7,    slice0I3,   param_I[1], RZ, $dshiftI;\n",

        j2c25 => "10:-:-:-:1  \@P0 LDG.E.CI.$dtypeI I00, [track0I0];\n",
        j2c27 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtypeI I01, [track0I2];\n",
        j2c29 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtypeI I02, [track0I4];\n",
        j2c31 => "--:-:5:-:1  \@P3 LDG.E.CI.$dtypeI I03, [track0I6];\n",

        I16() ? (
            j2c33 => "20:-:-:-:1  \@P6 F2F.F32.F16 I80, I80;\n",
            j2c35 => "--:-:-:-:1  \@P6 F2F.F32.F16 I81, I81;\n",
            j2c37 => "--:-:-:-:1  \@P6 F2F.F32.F16 I82, I82;\n",
            j2c39 => "--:-:6:-:1  \@P6 F2F.F32.F16 I83, I83;\n",
        ) : (),

        # Zero the I0 registers whose load was predicated off. These must stay ahead of
        # j2c52, which overwrites P0-P3.
        j2c41 => "--:-:-:-:1 \@!P0 I2I.U32.U32 I00, RZ;\n",
        j2c43 => "--:-:-:-:1 \@!P1 I2I.U32.U32 I01, RZ;\n",
        j2c45 => "--:-:-:-:1 \@!P2 I2I.U32.U32 I02, RZ;\n",
        j2c47 => "--:-:-:-:1 \@!P3 I2I.U32.U32 I03, RZ;\n",


        # I8: the same store / address rebuild / reload / zero sequence, on barrier 6.
        j2c51 => "20:6:-:-:1  \@P6 STS.128 [writeIs + 4x<8*32>], I8;\n",

        j2c52 => "02:-:-:-:1      ISETP.GE.AND P0, PT, slice8I0, RZ, P5;\n" .
                 "--:-:-:-:1  \@P5 IADD3 slice8I0, slice8I0, offset8Ic, offsetIn;\n" .
                 "--:-:-:-:1      ISETP.GE.AND P1, PT, slice8I1, RZ, P5;\n" .
                 "--:-:-:-:1  \@P5 IADD3 slice8I1, slice8I1, offset8Ic, offsetIn;\n" .
                 "--:-:-:-:1      ISETP.GE.AND P2, PT, slice8I2, RZ, P5;\n" .
                 "--:-:-:-:1  \@P5 IADD3 slice8I2, slice8I2, offset8Ic, offsetIn;\n" .
                 "--:-:-:-:1      ISETP.GE.AND P3, PT, slice8I3, RZ, P5;\n" .
                 "--:-:-:-:1  \@P5 IADD3 slice8I3, slice8I3, offset8Ic, offsetIn;\n" .
                 "--:-:-:-:1  \@P5 LEA      track8I0.CC, slice8I0,   param_I[0],     $dshiftI;\n",
        j2c57 => "--:-:-:-:1  \@P5 LEA.HI.X track8I1,    slice8I0,   param_I[1], RZ, $dshiftI;\n",
        j2c58 => "--:-:-:-:1  \@P5 LEA      track8I2.CC, slice8I1,   param_I[0],     $dshiftI;\n",
        j2c63 => "--:-:-:-:1  \@P5 LEA.HI.X track8I3,    slice8I1,   param_I[1], RZ, $dshiftI;\n",
        j3c0  => "--:-:-:-:1  \@P5 LEA      track8I4.CC, slice8I2,   param_I[0],     $dshiftI;\n",
        j3c5  => "--:-:-:-:1  \@P5 LEA.HI.X track8I5,    slice8I2,   param_I[1], RZ, $dshiftI;\n",
        j3c6  => "--:-:-:-:1  \@P5 LEA      track8I6.CC, slice8I3,   param_I[0],     $dshiftI;\n",
        j3c11 => "--:-:-:-:1  \@P5 LEA.HI.X track8I7,    slice8I3,   param_I[1], RZ, $dshiftI;\n",

        j3c8  => "20:-:-:-:1  \@P0 LDG.E.CI.$dtypeI I80, [track8I0];\n",
        j3c10 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtypeI I81, [track8I2];\n",
        j3c12 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtypeI I82, [track8I4];\n",
        j3c14 => "--:-:6:-:1  \@P3 LDG.E.CI.$dtypeI I83, [track8I6];\n",

        j3c24 => "--:-:-:-:1 \@!P0 I2I.U32.U32 I80, RZ;\n",
        j3c26 => "--:-:-:-:1 \@!P1 I2I.U32.U32 I81, RZ;\n",
        j3c28 => "--:-:-:-:1 \@!P2 I2I.U32.U32 I82, RZ;\n",
        j3c30 => "--:-:-:-:1 \@!P3 I2I.U32.U32 I83, RZ;\n",


        # CTA-wide sync after this pass's shared stores, then flip buffers: the read
        # pointers move by -swapBuf, the write pointers by +swapBuf, and swapBuf is negated.
        j3c15 => "--:-:-:-:5      BAR.SYNC 0;\n" .
                 "--:-:-:-:1  \@P6 IADD readFs,  readFs, -swapBuf;\n" .
                 "--:-:-:-:1  \@P6 IADD readIs,  readIs, -swapBuf;\n" .
                 "--:-:-:-:1  \@P6 IADD writeFs, writeFs, swapBuf;\n" .
                 "--:-:-:-:1  \@P6 IADD writeIs, writeIs, swapBuf;\n" .
                 "--:-:-:-:1  \@P6 IADD swapBuf, RZ,     -swapBuf;\n",

        j3c63 => "--:-:-:Y:5  \@P6 BRA.U LOOP;\n",


    );
    # Build the FFMA visiting order over the 8x8 accumulator tile: for each x pair
    # (0,2,4,6) and each y base, the four offsets in @swirl are applied, and the y order is
    # reversed after every x pair. This yields 64 (x, y) coordinates in total.
    my @cOrder;
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    my @y = (0,1,4,5);
    foreach my $x (0,2,4,6)
    {
        foreach my $y (@y)
        {
            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
        }
        @y = reverse @y;
    }
    my $out;
    foreach my $j (0 .. 3)
    {
        # Steps alternate between the j0 and j1 operand registers ($odd selects the set
        # being consumed, $nOdd the set being refilled for the next step).
        my $odd      = $j & 1;
        my $nOdd     = !$odd + 0;
        # Shared-memory row read for the next step; wraps to row 0 on the last step.
        my $rsOffset = ($j + 1) % 4;
        # On the last step the refill is predicated on P6 and placed at slots 16-22,
        # i.e. after the sync and buffer flip emitted at slot 15.
        my $rsPred   = $j == 3 ? '@P6' : '   ';
        my ($c0, $c2, $c4, $c6) = $j == 3 ? (16,18,20,22) : (0,2,4,6);

        # The last of the four loads sets write barrier 1, which the first FFMA of the
        # following step waits on.
        $insert{"j${j}c$c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c$c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c$c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c$c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;

        foreach my $c (0 .. 63)
        {
            my ($x,$y) = @{$cOrder[$c]};

            my $ins    = $insert{"j${j}c$c"} || '';

            # Stall 0 when the inserted text contains one of the listed memory,
            # conversion, barrier or branch opcodes; otherwise stall 1.
            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;

            # The yield flag is set only on FFMA 32 of each step, and only if it stalls.
            my $yield  = $c == 32 && $stall ? 'Y' : '-';

            # The first FFMA of every step waits on barrier 1 (operand loads complete).
            my $wait   = $c == 0 ? '01' : '--';

            my $ctrl   = "$wait:-:-:$yield:$stall";

            $out .= sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
        }
    }
    return $out;
+]


<SCHEDULE_BLOCK>
// Output-stage setup. Derives the per-thread indices and addresses used by
// STORE_O, scales accumulator rows y0-y3 by alpha and stores them through
// writeOs so STORE_O can read them back through readOs.

// tid31 = tid & 31 (low five bits), tid32 = tid >> 5
--:-:-:-:1      LOP.AND tid31, tid, 31;
--:-:-:-:1      SHR.U32 tid32, tid, 5;

// lutMPQ = (TRS*32 + tid31)*4 + addr_lut
--:-:-:-:1      ISCADD lutMPQ, TRS, tid31, 5;
--:-:-:-:1      ISCADD lutMPQ, lutMPQ, addr_lut, 2;

// This load signals barrier 1; the ISETP on offsetMPQ further down waits on it (mask 01).
--:-:1:-:1      LDS.U.32 offsetMPQ, [lutMPQ];


// readOs = (tid_32*32*4*4 + tid_31 + tid_32*16) * 4
--:-:-:-:1      ISCADD readOs, tid32, tid31, 9;
--:-:-:-:1      ISCADD readOs, tid32, readOs, 4;
--:-:-:-:1      SHL    readOs, readOs, 2;

// k = idx_K*32 + tid_32*4
// Note: tid32 is overwritten with tid32*4 here; the lutK computation below relies on that.
--:-:-:-:1      SHL tid32, tid32, 2;
--:-:-:-:1      ISCADD  k0, idx_K, tid32, 5;
--:-:-:-:1      IADD    k1, k0, 1;
--:-:-:-:1      IADD    k2, k0, 2;
--:-:-:-:1      IADD    k3, k0, 3;

// lutK = tid32*4*4 + shareLutK + 4x<0|1|2|3>
--:-:-:-:1      ISCADD lutK, tid32, shareLutK, 2;

--:-:-:-:1      MOV alpha, param_alpha;

// Scale accumulator rows y0-y3 by alpha into the shuffle_* registers.
--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
--:-:-:-:1      FMUL shuffle_x7y0, cx7y0, alpha;
--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
--:-:-:-:1      FMUL shuffle_x3y1, cx3y1, alpha;
--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
--:-:-:-:1      FMUL shuffle_x7y1, cx7y1, alpha;
--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
--:-:-:-:1      FMUL shuffle_x3y2, cx3y2, alpha;
--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
--:-:-:-:1      FMUL shuffle_x7y2, cx7y2, alpha;
--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
--:-:-:-:1      FMUL shuffle_x3y3, cx3y3, alpha;
--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
--:-:-:-:1      FMUL shuffle_x7y3, cx7y3, alpha;
<ORDERED>
// Each 128-bit store writes four registers starting at the named one.
// NOTE(review): this presumes shuffle_x0..x3 and shuffle_x4..x7 of a row map to
// consecutive registers - confirm against the REGISTER_MAPPING section.
--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 00>], shuffle_x0y0;
--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 16>], shuffle_x4y0;
--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 00>], shuffle_x0y1;
--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 16>], shuffle_x4y1;
--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 00>], shuffle_x0y2;
--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 16>], shuffle_x4y2;

// m < M && p < P && q < Q
// P4 = (offsetMPQ >= 0); it gates every k predicate computed in STORE_O.
// NOTE(review): presumably the lookup table holds a negative value for
// out-of-range positions - verify against the code that builds the table.
01:-:-:-:1      ISETP.GE.AND P4, PT, offsetMPQ, RZ, PT;

--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 00>], shuffle_x0y3;
--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 16>], shuffle_x4y3;
</ORDERED>

// O = n*KMPQ + k*MPQ + m*PQ + p*Q + q
// O = n*KMPQ + offsetK + offsetMPQ
// Folds the n*KMPQ term into offsetMPQ; STORE_O adds the per-k offsetK term.
--:-:-:-:0      XMAD.LO2C offsetMPQ, idx_N, param_KMPQ, offsetMPQ;
</SCHEDULE_BLOCK>

// Barrier between the staging stores for rows y0-y3 and the reads in STORE_O.
--:-:-:-:5      BAR.SYNC 0;

// Two STORE_O passes over the staged rows. Each call advances k0-k3 by 8 and
// steps lutK itself; readOs is advanced here by (8*128 + 2*16) words between calls.
--:-:-:-:5      CAL STORE_O;
--:-:-:-:0      IADD readOs, readOs, 4x<8*128 + 2*16>;
--:-:-:-:5      CAL STORE_O;

// Barrier before the same writeOs locations are overwritten with rows y4-y7.
--:-:-:-:5      BAR.SYNC 0;

<SCHEDULE_BLOCK>
// Second half of the output stage: scale accumulator rows y4-y7 by alpha and
// store them to the same writeOs offsets that rows y0-y3 used earlier.
--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
--:-:-:-:1      FMUL shuffle_x6y4, cx6y4, alpha;
--:-:-:-:1      FMUL shuffle_x7y4, cx7y4, alpha;
--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
--:-:-:-:1      FMUL shuffle_x3y5, cx3y5, alpha;
--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
--:-:-:-:1      FMUL shuffle_x7y5, cx7y5, alpha;
--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
--:-:-:-:1      FMUL shuffle_x3y6, cx3y6, alpha;
--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
--:-:-:-:1      FMUL shuffle_x7y6, cx7y6, alpha;
--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
--:-:-:-:1      FMUL shuffle_x3y7, cx3y7, alpha;
--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
--:-:-:-:1      FMUL shuffle_x7y7, cx7y7, alpha;
// Each 128-bit store writes four registers starting at the named one.
--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 00>], shuffle_x0y4;
--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 16>], shuffle_x4y4;
--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 00>], shuffle_x0y5;
--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 16>], shuffle_x4y5;
--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 00>], shuffle_x0y6;
--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 16>], shuffle_x4y6;
--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 00>], shuffle_x0y7;
--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 16>], shuffle_x4y7;
</SCHEDULE_BLOCK>
// Barrier between the staging stores for rows y4-y7 and the reads in STORE_O.
--:-:-:-:5      BAR.SYNC 0;

// Rewind readOs by the amount added during the first pair of calls, then run
// the same two-call sequence again. k0-k3 and lutK are not rewound; they keep
// advancing across all four STORE_O calls.
--:-:-:-:0      IADD readOs, readOs, -4x<8*128 + 2*16>;
--:-:-:-:5      CAL STORE_O;
--:-:-:-:0      IADD readOs, readOs,  4x<8*128 + 2*16>;
--:-:-:-:5      CAL STORE_O;

// Kernel end; the wait mask 0f covers barriers 1-4.
0f:-:-:-:5      EXIT;

STORE_O:
// Subroutine: writes up to four output values per thread, one for each of k0-k3.
//   Reads:   k0-k3, block_K, P4, lutK, readOs, offsetMPQ, param_O
//   Updates: k0-k3 (each += 8) and lutK (+= 8 words, only when P0 is set)
//   Uses:    P0-P3 always; P5 and P6 as scratch in the fp16 atomic path
// For each k it sums four partial values read 32 words apart from readOs, then
// stores the sum (or atomically adds it when overlapK is set) at
// param_O + ((offsetK + offsetMPQ) << dshiftO).

<SCHEDULE_BLOCK>
// P<i> = (k<i> < block_K) && P4
--:-:-:-:1      ISETP.LT.AND P0, PT, k0, block_K, P4;
--:-:-:-:1      ISETP.LT.AND P1, PT, k1, block_K, P4;
--:-:-:-:1      ISETP.LT.AND P2, PT, k2, block_K, P4;
--:-:-:-:1      ISETP.LT.AND P3, PT, k3, block_K, P4;
// Advance k for the next call.
--:-:-:-:1      IADD k0, k0, 8;
--:-:-:-:1      IADD k1, k1, 8;
--:-:-:-:1      IADD k2, k2, 8;
--:-:-:-:1      IADD k3, k3, 8;

// One 128-bit load fetches offsetK0-3; it signals barrier 6, which the first
// offsetO add below waits on (mask 20).
--:-:6:-:1  @P0 LDS.U.128 offsetK, [lutK];
</SCHEDULE_BLOCK>

// Four partial values per k row. The last load of each group signals barrier
// 1, 2, 3 or 4, matching the wait masks 01/02/04/08 on the first FADD of each row.
--:-:-:-:1  @P0 LDS s0_0, [readOs + 4x<0*128 + 0*32>];
--:-:-:-:1  @P0 LDS s0_1, [readOs + 4x<0*128 + 1*32>];
--:-:-:-:1  @P0 LDS s0_2, [readOs + 4x<0*128 + 2*32>];
--:-:1:-:1  @P0 LDS s0_3, [readOs + 4x<0*128 + 3*32>];

--:-:-:-:1  @P1 LDS s1_0, [readOs + 4x<1*128 + 0*32>];
--:-:-:-:1  @P1 LDS s1_1, [readOs + 4x<1*128 + 1*32>];
--:-:-:-:1  @P1 LDS s1_2, [readOs + 4x<1*128 + 2*32>];
--:-:2:-:1  @P1 LDS s1_3, [readOs + 4x<1*128 + 3*32>];

--:-:-:-:1  @P2 LDS s2_0, [readOs + 4x<2*128 + 0*32>];
--:-:-:-:1  @P2 LDS s2_1, [readOs + 4x<2*128 + 1*32>];
--:-:-:-:1  @P2 LDS s2_2, [readOs + 4x<2*128 + 2*32>];
--:-:3:-:1  @P2 LDS s2_3, [readOs + 4x<2*128 + 3*32>];

--:-:-:-:1  @P3 LDS s3_0, [readOs + 4x<3*128 + 0*32>];
--:-:-:-:1  @P3 LDS s3_1, [readOs + 4x<3*128 + 1*32>];
--:-:-:-:1  @P3 LDS s3_2, [readOs + 4x<3*128 + 2*32>];
--:-:4:-:1  @P3 LDS s3_3, [readOs + 4x<3*128 + 3*32>];

<SCHEDULE_BLOCK>
// offsetO<i> = offsetK<i> + offsetMPQ (element offset into O)
20:-:-:-:1  @P0 IADD offsetO0, offsetK0, offsetMPQ;
--:-:-:-:1  @P1 IADD offsetO1, offsetK1, offsetMPQ;
--:-:-:-:1  @P2 IADD offsetO2, offsetK2, offsetMPQ;
--:-:-:-:1  @P3 IADD offsetO3, offsetK3, offsetMPQ;

// 64-bit address: Out<i> = param_O + (offsetO<i> << dshiftO).
// dshiftO is 1 for 16-bit output and 2 for 32-bit output.
--:-:-:-:1  @P0 LEA      Out0_0.CC, offsetO0, param_O[0],     [+ dshiftO() +];
--:-:-:-:1  @P0 LEA.HI.X Out0_1,    offsetO0, param_O[1], RZ, [+ dshiftO() +];
--:-:-:-:1  @P1 LEA      Out1_0.CC, offsetO1, param_O[0],     [+ dshiftO() +];
--:-:-:-:1  @P1 LEA.HI.X Out1_1,    offsetO1, param_O[1], RZ, [+ dshiftO() +];
--:-:-:-:1  @P2 LEA      Out2_0.CC, offsetO2, param_O[0],     [+ dshiftO() +];
--:-:-:-:1  @P2 LEA.HI.X Out2_1,    offsetO2, param_O[1], RZ, [+ dshiftO() +];
--:-:-:-:1  @P3 LEA      Out3_0.CC, offsetO3, param_O[0],     [+ dshiftO() +];
--:-:-:-:1  @P3 LEA.HI.X Out3_1,    offsetO3, param_O[1], RZ, [+ dshiftO() +];

// Pairwise reduction of the four partial values of each row:
// s<i> = (s<i>_0 + s<i>_1) + (s<i>_2 + s<i>_3)
01:-:-:-:1  @P0 FADD s0_0, s0_0, s0_1;
--:-:-:-:1  @P0 FADD s0_2, s0_2, s0_3;

02:-:-:-:1  @P1 FADD s1_0, s1_0, s1_1;
--:-:-:-:1  @P1 FADD s1_2, s1_2, s1_3;

04:-:-:-:1  @P2 FADD s2_0, s2_0, s2_1;
--:-:-:-:1  @P2 FADD s2_2, s2_2, s2_3;

08:-:-:-:1  @P3 FADD s3_0, s3_0, s3_1;
--:-:-:-:1  @P3 FADD s3_2, s3_2, s3_3;

--:-:-:-:1  @P0 FADD s0, s0_0, s0_2;
--:-:-:-:1  @P1 FADD s1, s1_0, s1_2;
--:-:-:-:1  @P2 FADD s2, s2_0, s2_2;
--:-:-:-:1  @P3 FADD s3, s3_0, s3_2;

[+
    # Select the store sequence at assembly time from two template settings:
    #   overlapK() - atomic reduction (RED) instead of a plain store (STG)
    #   O16()      - 16-bit output, which adds an F32 to F16 conversion first
    # The text inside each q{} below is emitted verbatim as assembly.
    if (overlapK())
    {
        # If the output channels overlap across blocks we need to atomic add the results.
        # In the case of fp16 use the top or bottom half of the F16x2 instruction.
        #   This wastes some throughput to L2 but this isn't the bottleneck.
        # In the fp16 variant, bit 1 of the low address word selects the half:
        # when it is set the value is moved to the upper 16 bits and the
        # address is aligned down to a 4-byte boundary.
        return O16() ? q{
--:-:1:-:1  @P0 F2F.F16.F32 s0, s0;
--:-:2:-:1  @P1 F2F.F16.F32 s1, s1;
--:-:3:-:1  @P2 F2F.F16.F32 s2, s2;
--:-:4:-:1  @P3 F2F.F16.F32 s3, s3;

--:-:-:-:1      LOP.AND.NZ P5, RZ, Out0_0, 2;
--:-:-:-:1      LOP.AND.NZ P6, RZ, Out1_0, 2;
01:-:-:-:1  @P5 XMAD.PSL.CLO s0, s0, 1, RZ; // same as left shift 16
02:-:-:-:1  @P6 XMAD.PSL.CLO s1, s1, 1, RZ;
--:-:-:-:2  @P5 LOP32I.AND Out0_0, Out0_0, 0xfffffffc;
--:-:-:-:2  @P6 LOP32I.AND Out1_0, Out1_0, 0xfffffffc;

--:-:-:-:1      LOP.AND.NZ P5, RZ, Out2_0, 2;
--:-:-:-:1      LOP.AND.NZ P6, RZ, Out3_0, 2;
04:-:-:-:1  @P5 XMAD.PSL.CLO s2, s2, 1, RZ;
08:-:-:-:1  @P6 XMAD.PSL.CLO s3, s3, 1, RZ;
--:-:-:-:2  @P5 LOP32I.AND Out2_0, Out2_0, 0xfffffffc;
--:-:-:-:2  @P6 LOP32I.AND Out3_0, Out3_0, 0xfffffffc;
<ORDERED>
--:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out0_0], s0;
--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out1_0], s1;
--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out2_0], s2;
--:6:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out3_0], s3;
</ORDERED>

        } : q{
<ORDERED>
--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out0_0], s0;
--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out1_0], s1;
--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out2_0], s2;
--:6:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out3_0], s3;
</ORDERED>
        };
    }
    else
    {
        # No overlap: plain stores. The fp16 variant converts each sum first;
        # each conversion signals its own barrier (1-4) that the matching
        # store waits on.
        return O16() ? q{
<ORDERED>
--:-:1:-:1  @P0 F2F.F16.F32 s0, s0;
--:-:2:-:1  @P1 F2F.F16.F32 s1, s1;
--:-:3:-:1  @P2 F2F.F16.F32 s2, s2;
--:-:4:-:1  @P3 F2F.F16.F32 s3, s3;
01:-:-:-:1  @P0 STG.E.CG.U16 [Out0_0], s0;
02:-:-:-:1  @P1 STG.E.CG.U16 [Out1_0], s1;
04:-:-:-:1  @P2 STG.E.CG.U16 [Out2_0], s2;
08:6:-:-:1  @P3 STG.E.CG.U16 [Out3_0], s3;
</ORDERED>

        } : q{
<ORDERED>
--:-:-:-:1  @P0 STG.E.CG.32 [Out0_0], s0;
--:-:-:-:1  @P1 STG.E.CG.32 [Out1_0], s1;
--:-:-:-:1  @P2 STG.E.CG.32 [Out2_0], s2;
--:6:-:-:1  @P3 STG.E.CG.32 [Out3_0], s3;
</ORDERED>
        };
    }
+]

// Step the offsetK table pointer by 8 words for the next call. The update is
// skipped when P0 is false.
// NOTE(review): k0 only grows between calls and P4 is fixed, so once P0 is
// false it looks like it stays false for later calls - confirm.
--:-:-:-:1  @P0 IADD lutK, lutK, 4x<8>;
</SCHEDULE_BLOCK>

--:-:-:-:5      RET;
